In [1]:
import numpy as np
import seaborn as sb
import pandas as pd
import MyML.cluster.K_Means3 as myKM
import MyML.cluster.eac as eac
import MyML.helper.partition as mpart
import MyML.metrics.accuracy as accuracy
import MyML.cluster.linkage as linkage


/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/decorators.py:106: UserWarning: autojit is deprecated and will be removed in a future release. Use jit instead.
  warn('autojit is deprecated and will be removed in a future release. Use jit instead.')

Prepare dataset


In [2]:
ls ../datasets/ecg_it/


data_leadI_todos.mat  ecg_it.data  ecg_it.names

In [31]:
foldername = "/home/chiroptera/QCThesis/datasets/" + "ecg_it/"
dataname = foldername + "ecg_it.data"

In [515]:
np.savetxt(foldername + "data.csv", data, delimiter=',')
np.savetxt(foldername + "ground_truth.csv", gt, delimiter=',')

In [32]:
dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
dataset.head()


(5252, 11)
Out[32]:
0 1 2 3 4 5 6 7 8 9 10
0 NaN 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9
1 0 36.323065 127.324016 366.239635 940.892115 1442.984831 875.125756 29.243837 280.000000 50.919775 1
2 1 31.364310 103.658958 248.793275 520.264406 674.872210 528.115100 20.809815 496.888889 53.699668 1
3 2 13.115809 42.837888 98.186910 200.074959 298.550081 276.768066 10.456345 531.333333 9.533566 0
4 3 61.169188 232.372398 773.890215 1802.451147 2282.307487 1784.527244 73.703583 599.857143 14.055284 1

In [24]:
raw = dataset.values[1:,1:]
dataset = pd.DataFrame(raw)
dataset.head()


Out[24]:
0 1 2 3 4 5 6 7 8 9
0 36.323065 127.324016 366.239635 940.892115 1442.984831 875.125756 29.243837 280.000000 50.919775 1
1 31.364310 103.658958 248.793275 520.264406 674.872210 528.115100 20.809815 496.888889 53.699668 1
2 13.115809 42.837888 98.186910 200.074959 298.550081 276.768066 10.456345 531.333333 9.533566 0
3 61.169188 232.372398 773.890215 1802.451147 2282.307487 1784.527244 73.703583 599.857143 14.055284 1
4 44.265739 151.741567 365.092849 458.897507 332.241997 170.878889 9.968925 376.769231 93.317616 0

In [29]:
dataset.to_csv(dataname)

In [664]:
print dataset[64].unique()
print dataset[64].unique().size


[0 7 4 6 2 5 8 1 9 3]
10

In [421]:
dataset[dataset == 'y'] = 1

In [500]:
dataset[9].unique()


Out[500]:
array(['MIT', 'NUC', 'CYT', 'ME1', 'EXC', 'ME2', 'ME3', 'VAC', 'POX', 'ERL'], dtype=object)

In [427]:
dataset[dataset=='republican']=1

In [431]:
dataset = dataset.astype(np.int32)

In [436]:
new_cols = dataset.columns.astype(np.object)

In [416]:
for i in range(1,17):
    dataset = dataset[dataset[i] != '?']

In [417]:
dataset.shape


Out[417]:
(232, 17)

In [400]:
dataset.shape


Out[400]:
(379, 17)

In [518]:
dataset=load_iris()

In [520]:
data=dataset.data.astype(np.float32)
gt=dataset.target

In [667]:
data = dataset.get_values()[:,:-1]

In [668]:
data = data.astype(np.float32)
data.shape


Out[668]:
(3823, 64)

In [669]:
gt=dataset.get_values()[:,-1]

In [670]:
gt = gt.astype(np.int32)

In [506]:
l=0
for i in np.unique(gt):
    gt[gt==i] = l
    l+=1

In [671]:
gt


Out[671]:
array([0, 0, 7, ..., 6, 6, 7], dtype=int32)

In [566]:
gt=np.zeros(600, dtype=np.int32)
for i in range(6):
    gt[i * 100 : i * 100 + 100] = i

iris


In [77]:
from sklearn.datasets import load_iris

iris=load_iris()
data=iris.data.astype(np.float32)
gt=iris.target

ionosphere


In [824]:
foldername = "/home/diogoaos/QCThesis/datasets/" + "ionosphere/"
dataname = foldername + "ionosphere.data"

dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
dataset.head()

data = dataset.values[:,:-1].astype(np.float32)
gt = dataset.values[:,-1]
gt[gt=='g']=1
gt[gt=='b']=0
gt = gt.astype(np.int32)


(351, 35)

optdigits


In [681]:
foldername = "/home/diogoaos/QCThesis/datasets/" + "optdigits/"
dataname = foldername + "optdigits.tra"

dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.get_values()[:,:-1]
data = data.astype(np.float32)
gt=dataset.get_values()[:,-1]
gt = gt.astype(np.int32)

mfeat-fou


In [701]:
foldername = "/home/diogoaos/QCThesis/datasets/" + "mfeat/"
dataname = foldername + "mfeat-fou.asc"

dataset = pd.read_csv(dataname, header=None, sep="  ")
data = dataset.get_values().astype(np.float32)
gt = np.empty(dataset.shape[0], dtype=np.int32)
for i in range(10):
    gt[i*200 : i*200+200]=i

isolet


In [149]:
foldername = "/home/chiroptera/QCThesis/datasets/" + "isolet/"
dataname = foldername + "isolet1-5.data"

dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.get_values().astype(np.float32)[:,:-1]
gt=dataset.get_values()[:,-1].astype(np.int32)

ECG IT


In [90]:
foldername = "/home/chiroptera/QCThesis/datasets/" + "ecg_it/"
dataname = foldername + "ecg_it.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.values[1:,1:-1].astype(np.float32)
gt = dataset.values[1:,-1].astype(np.int32)

#remove unlabeled
labeled_idx = gt!=2
data = data[labeled_idx]
gt = gt[labeled_idx]

In [79]:
gt0 = gt==0
gt1 = gt==1

In [80]:
data = np.concatenate((data[gt0],data[gt1][:600]))
gt = np.concatenate((gt[gt0],gt[gt1][:600]))

Process


In [81]:
from sklearn.cluster import KMeans

In [143]:
reload(myKM)


Out[143]:
<module 'MyML.cluster.K_Means3' from '/home/chiroptera/workspace/QCThesis/MyML/cluster/K_Means3.py'>

In [150]:
generator = myKM.K_Means()
generator._MAX_THREADS_BLOCK = 256
generator._PPT = 8
#generator = KMeans(init='random')

n_samples = data.shape[0]
true_n_clusters=np.unique(gt).size

sqrt_nsamples = np.sqrt(n_samples)
n_clusters = [sqrt_nsamples/2, sqrt_nsamples]
n_clusters = map(np.ceil, n_clusters)
n_clusters = map(int, n_clusters)
n_partitions = 100
rounds = 20
max_clusts=100


assoc_mode = "full"
prot_mode = "none"

print "number of samples: ", n_samples
print "interval of clusters: ", n_clusters
print "true number of clusters: ", true_n_clusters
print "number of partitions: ", n_partitions
print "number of rounds: ", rounds
print "maximum clusters assumption: ", max_clusts


number of samples:  7797
interval of clusters:  [45, 89]
true number of clusters:  26
number of partitions:  100
number of rounds:  20
maximum clusters assumption:  100

In [151]:
%time ensemble = mpart.generateEnsemble(data, generator, n_clusters=n_clusters, npartitions=n_partitions, iters=3)


---------------------------------------------------------------------------
CudaAPIError                              Traceback (most recent call last)
<ipython-input-151-39e88b443a2e> in <module>()
----> 1 get_ipython().magic(u'time ensemble = mpart.generateEnsemble(data, generator, n_clusters=n_clusters, npartitions=n_partitions, iters=3)')

/home/chiroptera/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2305         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2306         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2307         return self.run_line_magic(magic_name, magic_arg_s)
   2308 
   2309     #-------------------------------------------------------------------------

/home/chiroptera/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2226                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2227             with self.builtin_trap:
-> 2228                 result = fn(*args,**kwargs)
   2229             return result
   2230 

/home/chiroptera/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/chiroptera/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/chiroptera/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1164         else:
   1165             st = clock2()
-> 1166             exec(code, glob, local_ns)
   1167             end = clock2()
   1168             out = None

<timed exec> in <module>()

/home/chiroptera/workspace/QCThesis/MyML/helper/partition.pyc in generateEnsemble(data, generator, n_clusters, npartitions, iters)
    113             generator.n_clusters = k
    114 
--> 115         generator.fit(data)
    116         ensemble[x] = convertClusterStringToIndex(generator.labels_)
    117 

/home/chiroptera/workspace/QCThesis/MyML/cluster/K_Means3.py in fit(self, data)
    111         while not stopcond:
    112             # compute labels
--> 113             labels = self._label(data,self.centroids)
    114 
    115             self.iters_ += 1 #increment iteration counter

/home/chiroptera/workspace/QCThesis/MyML/cluster/K_Means3.py in _label(self, data, centroids)
    182         # we need array for distances to check convergence
    183         if self._mode == "cuda":
--> 184             labels = self._cu_label(data, centroids)
    185         elif self._mode == "special": #for tests only
    186             labels=np.empty(self.N, dtype=np.int32)

/home/chiroptera/workspace/QCThesis/MyML/cluster/K_Means3.py in _cu_label(self, data, centroids)
    444                                                                 centroids,
    445                                                                 labels,
--> 446                                                                 self._dists)
    447 
    448         else:

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.pyc in __call__(self, *args, **kwargs)
    313                           blockdim=self.blockdim,
    314                           stream=self.stream,
--> 315                           sharedmem=self.sharedmem)
    316 
    317     def bind(self):

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.pyc in _kernel_call(self, args, griddim, blockdim, stream, sharedmem)
    415         # retrieve auto converted arrays
    416         for wb in retr:
--> 417             wb()
    418 
    419     def _prepare_args(self, ty, val, stream, retr, kernelargs):

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.pyc in <lambda>()
    424             devary, conv = devicearray.auto_device(val, stream=stream)
    425             if conv:
--> 426                 retr.append(lambda: devary.copy_to_host(val, stream=stream))
    427 
    428             c_intp = ctypes.c_ssize_t

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/devicearray.pyc in copy_to_host(self, ary, stream)
    193         assert self.alloc_size >= 0, "Negative memory size"
    194         if self.alloc_size != 0:
--> 195             _driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
    196 
    197         if ary is None:

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.pyc in device_to_host(dst, src, size, stream)
   1377         fn = driver.cuMemcpyDtoH
   1378 
-> 1379     fn(host_pointer(dst), device_pointer(src), size, *varargs)
   1380 
   1381 

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.pyc in safe_cuda_api_call(*args)
    214         def safe_cuda_api_call(*args):
    215             retcode = libfn(*args)
--> 216             self._check_error(fname, retcode)
    217 
    218         setattr(self, fname, safe_cuda_api_call)

/home/chiroptera/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.pyc in _check_error(self, fname, retcode)
    244             errname = ERROR_MAP.get(retcode, "UNKNOWN_CUDA_ERROR")
    245             msg = "Call to %s results in %s" % (fname, errname)
--> 246             raise CudaAPIError(retcode, msg)
    247 
    248     def get_device(self, devnum=0):

CudaAPIError: Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR

In [146]:
print generator._gridDim
print generator._gridDim * generator._MAX_THREADS_BLOCK * generator._PPT


3
6144

sklearn 15.x mykm 2.66 1 ppt mykm


In [84]:
eacEst = eac.EAC(n_samples=n_samples, mat_sparse=False)
%time eacEst.fit(ensemble, assoc_mode=assoc_mode, prot_mode=prot_mode)


CPU times: user 667 ms, sys: 24 ms, total: 691 ms
Wall time: 399 ms

In [85]:
Z = eacEst._apply_linkage()
Z[:,:2]


Out[85]:
array([[  424.,   481.],
       [ 1110.,  1344.],
       [  140.,  1307.],
       ..., 
       [ 2363.,  3113.],
       [ 1043.,  3114.],
       [ 1580.,  3115.]])

In [102]:
true_n_clusters=0

In [86]:
%time labels = eacEst._lifetime_clustering(n_clusters=true_n_clusters)
accEst = accuracy.HungarianIndex(nsamples=n_samples)
%time accEst.score(gt, labels)
print accEst.accuracy


CPU times: user 83.3 ms, sys: 0 ns, total: 83.3 ms
Wall time: 82.4 ms
CPU times: user 17.2 ms, sys: 43 µs, total: 17.3 ms
Wall time: 16.2 ms
0.616420782553

In [66]:
for l in np.unique(gt):
    print l, (gt==l).sum()


0 959
1 959

In [87]:
for l in np.unique(labels):
    print l, (labels==l).sum()


0 2
1 1557

In [67]:
true_n_clusters = 0

In [68]:
thresholds = np.arange(0.05,1.01,0.05)
res = np.empty(((thresholds.size + 1) * rounds, 5))
# threshold, max_assocs, n_assocs, acc
n_cluster_ary = np.empty((thresholds.size + 1) * rounds, dtype=np.int32)

#progress bar
print ". " * rounds

i = 0
for r in range(rounds):
    
    print ".",

    ensemble = mpart.generateEnsemble(data, generator, n_clusters=n_clusters, npartitions=n_partitions, iters=3)

    eacEst = eac.EAC(n_samples=n_samples, mat_sparse=False)
    eacEst.fit(ensemble, assoc_mode=assoc_mode, prot_mode=prot_mode)

    max_assocs, max_idx = eacEst.getMaxAssocs()
    n_assocs = eacEst.getNNZAssocs()
    labels = eacEst._lifetime_clustering(n_clusters=true_n_clusters)
    accEst = accuracy.HungarianIndex(nsamples=n_samples)
    
    # HungarianIndex takes a huge time for a high cluster imbalance
    if np.unique(labels).size > max_clusts:
        accEst.accuracy = -1
    else:
        accEst.score(gt, labels)

    res[i, 0] = 0
    res[i, 1] = max_assocs
    res[i, 2] = n_assocs
    res[i, 3] = accEst.accuracy
    res[i, 4] = r
    n_cluster_ary[i] = np.unique(labels).size
    i += 1

    for j in range(thresholds.size):

        eacEst.apply_threshold(thresholds[j])

        max_assocs, max_idx = eacEst.getMaxAssocs()
        n_assocs = eacEst.getNNZAssocs()
        
        labels = eacEst._lifetime_clustering(n_clusters=true_n_clusters)
        
        accEst = accuracy.HungarianIndex(nsamples=n_samples)
        
        if np.unique(labels).size > max_clusts:
            accEst.accuracy = -1
        else:
            accEst.score(gt, labels)        

        max_assocs, max_idx = eacEst.getMaxAssocs()
        nnz_pc = eacEst.getNNZAssocs()

        res[i, 0] = thresholds[j]
        res[i, 1] = max_assocs
        res[i, 2] = nnz_pc
        res[i, 3] = accEst.accuracy
        res[i, 4] = r
        
        n_cluster_ary[i] = np.unique(labels).size
        
        i += 1

resPD = pd.DataFrame(data=res, columns=["threshold", "max assoc", "n assocs", "accuracy", "round"])
print "MAX ACCURACY=", resPD['accuracy'].max()
if true_n_clusters == 0:
    resPD['n_clusts']=n_cluster_ary
resPD


. . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . MAX ACCURACY= 0.777623349548
Out[68]:
threshold max assoc n assocs accuracy round n_clusts
0 0.00 1161 2992955 0.777623 0 2
1 0.05 756 1874437 0.777623 0 2
2 0.10 569 1412169 0.777623 0 2
3 0.15 479 1129645 0.777623 0 2
4 0.20 387 923003 0.777623 0 2
5 0.25 321 761271 0.777623 0 2
6 0.30 264 629641 0.777623 0 2
7 0.35 233 518523 0.777623 0 2
8 0.40 207 424195 0.777623 0 2
9 0.45 190 344023 0.777623 0 2
10 0.50 169 274889 0.777392 0 3
11 0.55 145 216929 0.776928 0 5
12 0.60 116 167605 0.773917 0 15
13 0.65 100 125645 0.766273 0 43
14 0.70 81 91273 0.755386 0 86
15 0.75 62 63223 -1.000000 0 206
16 0.80 51 41579 -1.000000 0 504
17 0.85 36 25401 -1.000000 0 1163
18 0.90 23 14269 -1.000000 0 2179
19 0.95 13 7765 -1.000000 0 3246
20 1.00 5 5271 -1.000000 0 3898
21 0.00 1157 2907841 0.777392 1 3
22 0.05 685 1823803 0.777392 1 3
23 0.10 558 1377829 0.777392 1 3
24 0.15 450 1105509 0.777392 1 3
25 0.20 365 907021 0.777392 1 3
26 0.25 300 750443 0.777392 1 3
27 0.30 255 620843 0.777392 1 3
28 0.35 231 513063 0.777392 1 3
29 0.40 204 420757 0.777392 1 3
... ... ... ... ... ... ...
390 0.60 139 181587 0.774612 18 14
391 0.65 114 136273 0.768589 18 28
392 0.70 85 99049 0.747510 18 87
393 0.75 65 69385 -1.000000 18 193
394 0.80 47 45991 -1.000000 18 479
395 0.85 37 28347 -1.000000 18 1086
396 0.90 26 15883 -1.000000 18 2048
397 0.95 13 8129 -1.000000 18 3148
398 1.00 4 5269 -1.000000 18 3891
399 0.00 1114 3015729 0.777623 19 2
400 0.05 711 1919825 0.777623 19 2
401 0.10 541 1450437 0.777623 19 2
402 0.15 427 1166765 0.777623 19 2
403 0.20 361 958693 0.777623 19 2
404 0.25 294 794781 0.777623 19 2
405 0.30 260 659607 0.777623 19 2
406 0.35 240 545229 0.777623 19 2
407 0.40 216 447497 0.777623 19 2
408 0.45 186 363527 0.777623 19 2
409 0.50 156 291901 0.777392 19 3
410 0.55 122 229567 0.774612 19 7
411 0.60 105 177419 0.773454 19 13
412 0.65 85 133807 0.767199 19 38
413 0.70 75 97603 0.757239 19 73
414 0.75 63 68581 -1.000000 19 212
415 0.80 50 45281 -1.000000 19 451
416 0.85 42 27833 -1.000000 19 1058
417 0.90 25 15537 -1.000000 19 2070
418 0.95 18 8125 -1.000000 19 3177
419 1.00 5 5279 -1.000000 19 3895

420 rows × 6 columns


In [922]:
print dataname


/home/diogoaos/QCThesis/datasets/ionosphere/ionosphere.data

In [923]:
save_folder = "/home/diogoaos/QCThesis/experiments/threshold/"
save_name = "ionosphere_lifetime"

In [924]:
resPD.to_csv(path_or_buf = save_folder + save_name + ".csv", index=False)